* Session setup: empty the workspace, switch off output paging, and make
* double the default storage type for variables created below.
clear all
set more off
set type double
* Change working directory: "STATA Code for SDG Economy"

********************************************************************************
********************************************************************************
********************************************************************************
**** STATA Code for SDG Economy
**** Building the SDG economy: Needs, spending, and financing for universal achievement of the Sustainable Development Goals
**** Homi Kharas & John McArthur
**** Final data and do-files compiled by Selen Özdoğan
**** January 2020
********************************************************************************
**** 1. Estimate public spending on the SDG Economy
********************************************************************************
********************************************************************************
********************************************************************************


/* 
	NAMING CONVENTIONS: 
	a_   = filled in missing values with interpolation
	_alt = applied 1.13x GDP pc multiplier to later years
	l_   = natural log 
*/

********************************************************************************
**** Outline *******************************************************************
********************************************************************************
* 1. Denominator and classification data
* 		A . GDP, PPP (constant 2011 international $)
*		B . GDP, LCU current
*		C . GDP, US$ current
*		D . GDP deflator
*		E . GDP, (constant 2010 US$)
*		F . GDP growth rate out to 2030
*		G . Population
*		H . World Bank income groups (historic)
*		I . World Bank region codes (all income)
*		J . GDP ratios for standardizing 
* 2. Agriculture
*		A . FAOSTAT - Government Expenditure on Agriculture
*		B . IFPRI - SPEED
* 3. Health
*		A . WHO - Global Health Observatory data repository
* 4. Education
*		A . World Bank - WDI
* 5. Social spending
*		A . ILO - World Social Protection Report Data 2017-2019
* 6. Infrastructure
*		A . OECD - National Account Statistics
*		B . IMF - Investment and capital stock dataset
* 7. Conservation
*		A . Waldron et al. 2013
* 8. Justice (Public safety and order)
*		A . IMF - Government Finance Statistics
* 9. Merge all datasets together
*		A . Total country spending 
*		B . Total world spending in 2015 and 2030
*		C . Spending in 2015, 2030, and avg. 2015-2030, by income group
*		D . Spending in 2015 and 2030, by region
*		E . Per capita spending in 2015, 2025, 2030, and avg. 2015-2030, by sector
* 10. Fill in missing values
*		A . Fill in spending for missing sectors with regional income group averages, weighted by population
*		B . Per capita total, by income group, and by region
* 11. Building related datasets
*		A . Apply 1.13x GDP/pc multiplier to later years
*		B . Merge with DRM and ODA data

* Note: All WDI data downloaded on 10/10/2019

********************************************************************************
********************************************************************************
*** 1. Denominator and classification data
********************************************************************************
********************************************************************************

* Supplementary GDP series from the IMF WEO workbook; the sections below use
* them to replace WDI series where WEO has better coverage
 * Downloaded 6/27/2019

import excel using "input\WEO_GDP_info.xlsx", firstrow clear
drop if SubjectDescriptor == ""

* Year columns (G-AY): blank out the "n/a" / "--" markers, convert to numeric,
* and rename each column to ind<column label>
foreach col of varlist G-AY {
	local lbl : variable label `col'
	replace `col' = "" if inlist(`col', "n/a", "--")
	destring `col', replace
	rename `col' ind`lbl'
}
rename *, lower
keep iso country subjectdescriptor units scale ind*
recast str3 iso

* Long format: one row per country x indicator x year
reshape long ind, i(iso country subjectdescriptor units scale) j(year)
egen id = group(subjectdescriptor units) // 1 = GDP pc, constant 2011 PPP; 2 = GDP LCU; 3 = GDP current USD; 4 = Population
tab id scale // 1 = Units fine as they are; 2 & 3 = Billions; 4 = Millions
keep iso country year ind id

* Back to one column per indicator
reshape wide ind, i(iso country year) j(id)

* Descriptive names
rename iso countrycode
rename country countryname
rename ind1 gdppc_ppp
rename ind2 gdp_lcu_weo
rename ind3 gdp_curr_weo
rename ind4 pop

* Rescale to single units (GDP reported in billions, population in millions)
foreach v in gdp_lcu_weo gdp_curr_weo {
	replace `v' = `v' * 1e9
}
replace pop = pop * 1e6

* Total GDP in PPP terms = per-capita GDP x population
gen gdp_ppp_weo = gdppc_ppp * pop

* Keep identifiers plus the three total-GDP series
keep countrycode countryname year gdp_lcu_weo gdp_curr_weo gdp_ppp_weo
label var gdp_lcu_weo "Gross domestic product, current prices (National currency) WEO April 2019"
label var gdp_curr_weo "Gross domestic product, current prices (U.S. dollars) WEO April 2019"
label var gdp_ppp_weo "Gross domestic product, constant 2011 PPP WEO April 2019"

* Same year window as the WDI pulls
keep if inrange(year, 2000, 2017)

save "output\GDP extra WEO.dta", replace

* Wide copy (one row per country) for merging into the wide WDI files
reshape wide gdp_lcu_weo gdp_curr_weo gdp_ppp_weo, i(countrycode countryname) j(year)
save "output\GDP extra WEOwide.dta", replace

********************************************************************************
** A . GDP, PPP (constant 2011 international $)
********************************************************************************
* Pull the WDI series (wide: one yr<year> column per year)
wbopendata, indicator(NY.GDP.MKTP.PP.KD) nometadata year(2000:2017) clear

* Drop aggregate rows; give Nauru a region code before dropping blank-region rows
drop if region == "Aggregates"
replace region = "EAP" if countryname == "Nauru"
drop if region == ""

forvalues yr = 2000/2017 {
	rename yr`yr' gdp_ppp`yr'
	label variable gdp_ppp`yr' "`yr' GDP, PPP (constant 2011 international $)"
}

keep countryname countrycode region gdp_ppp*

save "output\GDP_ppp2011constant.dta", replace

* Bring in the IMF WEO series alongside the WDI series (the data just saved is
* still in memory, so it is not reloaded); WEO-only countries are discarded
merge 1:1 countrycode using "output\GDP extra WEOwide.dta", keepusing(gdp_ppp_weo*) keep(master match) nogenerate

* Per country, count the years with data in each source
egen wdi_count = rownonmiss(gdp_ppp2000-gdp_ppp2017)
egen weo_count = rownonmiss(gdp_ppp_weo2000-gdp_ppp_weo2017)

tab countryname if weo_count > wdi_count // These are the countries for which we can replace WDI series with WEO series

* Where WEO covers more years, overwrite the entire WDI series with WEO
forvalues yr = 2000/2017 {
	replace gdp_ppp`yr' = gdp_ppp_weo`yr' if weo_count > wdi_count
}
drop *_count gdp_ppp_weo*

* Wide file including the WEO replacements
save "output\GDP_ppp2011constantplus.dta", replace

* Long file (country x year)
reshape long gdp_ppp, i(countrycode) j(year)
save "output\GDP_ppp2011constantLONG.dta", replace

********************************************************************************
** B . GDP, LCU current
********************************************************************************
* Pull the WDI series (wide: one yr<year> column per year)
wbopendata, indicator(NY.GDP.MKTP.CN) nometadata year(2000:2017) clear

* Drop aggregate rows; give Nauru a region code before dropping blank-region rows
drop if region == "Aggregates"
replace region = "EAP" if countryname == "Nauru"
drop if region == ""

forvalues yr = 2000/2017 {
	rename yr`yr' gdp_lcu`yr'
	label variable gdp_lcu`yr' "`yr' GDP, LCU current"
}

keep countryname countrycode region gdp_lcu*

save "output\GDP_lcu_current.dta", replace

* Bring in the IMF WEO series alongside the WDI series (the data just saved is
* still in memory, so it is not reloaded); WEO-only countries are discarded
merge 1:1 countrycode using "output\GDP extra WEOwide.dta", keepusing(gdp_lcu_weo*) keep(master match) nogenerate

* Per country, count the years with data in each source
egen wdi_count = rownonmiss(gdp_lcu2000-gdp_lcu2017)
egen weo_count = rownonmiss(gdp_lcu_weo2000-gdp_lcu_weo2017)

tab countryname if weo_count > wdi_count // These are the countries for which we can replace WDI series with WEO series

* Where WEO covers more years, overwrite the entire WDI series with WEO
forvalues yr = 2000/2017 {
	replace gdp_lcu`yr' = gdp_lcu_weo`yr' if weo_count > wdi_count
}
drop *_count gdp_lcu_weo*

* Wide file including the WEO replacements
save "output\GDP_lcu_currentplus.dta", replace

* Long file (country x year)
reshape long gdp_lcu, i(countrycode) j(year)
save "output\GDP_lcu_currentLONG.dta", replace

********************************************************************************
** C . GDP, US$ current
********************************************************************************
* Pull the WDI series (wide: one yr<year> column per year)
wbopendata, indicator(NY.GDP.MKTP.CD) nometadata year(2000:2017) clear

* Drop aggregate rows; give Nauru a region code before dropping blank-region rows
drop if region == "Aggregates"
replace region = "EAP" if countryname == "Nauru"
drop if region == ""

forvalues yr = 2000/2017 {
	rename yr`yr' gdp_us_curr`yr'
	label variable gdp_us_curr`yr' "`yr' GDP, US$ current"
}

keep countryname countrycode region gdp_us_curr*

save "output\GDP_US_current.dta", replace

* Bring in the IMF WEO series alongside the WDI series (the data just saved is
* still in memory, so it is not reloaded); WEO-only countries are discarded
merge 1:1 countrycode using "output\GDP extra WEOwide.dta", keepusing(gdp_curr_weo*) keep(master match) nogenerate

* Per country, count the years with data in each source
egen wdi_count = rownonmiss(gdp_us_curr2000-gdp_us_curr2017)
egen weo_count = rownonmiss(gdp_curr_weo2000-gdp_curr_weo2017)

tab countryname if weo_count > wdi_count // These are the countries for which we can replace WDI series with WEO series

* Where WEO covers more years, overwrite the entire WDI series with WEO
forvalues yr = 2000/2017 {
	replace gdp_us_curr`yr' = gdp_curr_weo`yr' if weo_count > wdi_count
}
drop *_count gdp_curr_weo*

* Wide file including the WEO replacements
save "output\GDP_US_currentplus.dta", replace

* Long file (country x year)
reshape long gdp_us_curr, i(countrycode) j(year)
label variable gdp_us_curr "GDP, US$ current"
save "output\GDP_US_currentLONG.dta", replace

********************************************************************************
** D . GDP deflator
********************************************************************************
* US GDP deflator ratios relative to 2015. Three files are written in turn;
* each later file also carries the ratio variables generated before it.
wbopendata, indicator(NY.GDP.DEFL.ZS) nometadata year(2010:2015) clear

* Only the USA row is used
drop if countrycode != "USA"

* 2014 -> 2015
generate ratio_14_15 = yr2015 / yr2014
label var ratio_14_15 "USA deflator 2015 / USA deflator 2014"

* Constant key so this single row can be merged onto every row of another dataset
generate merge = 1

save "output\GDPratio_2014_to_2015.dta", replace

* 2010 -> 2015
generate ratio_10_15 = yr2015 / yr2010
label var ratio_10_15 "USA deflator 2015 / USA deflator 2010"

save "output\GDPratio_2010_to_2015.dta", replace

* 2013 -> 2015
generate ratio_13_15 = yr2015 / yr2013
label var ratio_13_15 "USA deflator 2015 / USA deflator 2013"

save "output\GDPratio_2013_to_2015.dta", replace

********************************************************************************
** E . GDP, (constant 2010 US$)
********************************************************************************
* Pull the WDI series (wide: one yr<year> column per year)
wbopendata, indicator(NY.GDP.MKTP.KD) nometadata year(2000:2017) clear

* Drop aggregate rows; give Nauru a region code before dropping blank-region rows
drop if region == "Aggregates"
replace region = "EAP" if countryname == "Nauru"
drop if region == ""

forvalues yr = 2000/2017 {
	rename yr`yr' gdp_us_cons`yr'
	label variable gdp_us_cons`yr' "`yr' GDP, US$ constant 2010"
}

keep countryname countrycode region gdp_us_cons*

save "output\GDP_US_constant.dta", replace

* Long file (country x year)
reshape long gdp_us_cons, i(countrycode) j(year)

save "output\GDP_US_constantLONG.dta", replace

* Attach the 2015/2010 US deflator ratio to every row through a constant key,
* then rescale from constant 2010 to constant 2015 USD
gen merge = 1
merge m:1 merge using "output\GDPratio_2010_to_2015.dta", keepusing(ratio_10_15) nogenerate

gen gdp_us_cons15 = gdp_us_cons * ratio_10_15
label variable gdp_us_cons15 "GDP, US$ constant 2015"

keep countryname countrycode year gdp_us_cons15
save "output\GDP_US_constant15LONG.dta", replace

********************************************************************************
** F . GDP growth rate out to 2030
********************************************************************************
* Growth-rate workbook (long format: merged below on countrycode and year)
import excel "input\GDP growth rates - total and per capita.xlsx", sheet("Sheet1") firstrow clear
drop H I

* Harmonize country names so country codes can be attached
CLEAN_COUNTRY_NAMES countryname

replace countryname = "Curaçao" if countryname=="Curacao"

* Attach country codes; rows found only in the UN member list are discarded
merge m:1 countryname using "output\UN_memberstates.dta", keep(master match) nogenerate

replace countrycode = "SWZ" if countryname=="Eswatini"

drop if countrycode == ""

* Anchor the trajectory on 2015 GDP in current USD; every other year is blanked
* so it can be filled from the growth rates
merge 1:1 countrycode year using "output\GDP_US_currentLONG.dta", keepusing(countryname gdp_us_curr) keep(master match) nogenerate
replace gdp_us_curr = . if year != 2015

encode countrycode, gen(id)
xtset id year

* Start from the 2015 level, then fill each missing year as the prior year's
* level plus growth (prior level x gdp_con_growth)
gen gdp_traj = gdp_us_curr
replace gdp_traj = L1.gdp_traj + gdp_con_growth * L1.gdp_traj if gdp_traj == .

keep countryname countrycode year gdp_traj
label variable gdp_traj "GDP const. US$ 2015, 2030 trajectory"

save "output\gdp_cons_2030long.dta", replace

* Wide file for 2015 onward plus the row mean across gdp_traj2015-gdp_traj2030
keep if year >= 2015
reshape wide gdp_traj, i(countrycode) j(year)
egen gdp_traj_avg1530 = rowmean(gdp_traj2015-gdp_traj2030)

save "output\gdp_cons2030wide.dta", replace

********************************************************************************
** G . Population
********************************************************************************
* Total population from WDI, wide format (pop2000 ... pop2017)
wbopendata, indicator(SP.POP.TOTL) nometadata year(2000:2017) clear

* Keep everything except the aggregate rows
keep if region != "Aggregates"
keep countryname countrycode yr*
rename (yr*) (pop*)

save "output\population_total.dta", replace

********************************************************************************
** H . World Bank income groups (historic)
********************************************************************************
* Historical World Bank income classifications, read straight from the web
import excel "https://databank.worldbank.org/data/download/site-content/OGHIST.xls", sheet("Country Analytical History") firstrow clear case(low) cellrange(A6:AG229)
drop if a == ""
rename a countrycode
rename dataforcalendaryear countryname

* Columns c-ag: blank out the ".." placeholder and rename each column to
* incomegroup<column label>
foreach col of varlist c-ag {
	local lbl : variable label `col'
	replace `col' = "" if `col' == ".."
	rename `col' incomegroup`lbl'
}

forvalues yr = 1987/2017 {
	label variable incomegroup`yr' "`yr' incomegroup, calendar year"
}

save "output\WBincomegroups_all.dta", replace

********************************************************************************
** I . World Bank region codes (all income)
********************************************************************************
* World Bank country groupings, read straight from the web
import excel "https://databank.worldbank.org/data/download/site-content/CLASS.xls", sheet("Groups") firstrow clear case(low)

* Keep only the rows for these seven group codes; the group code becomes the region
keep if inlist(groupcode, "EAS", "ECS", "LCN", "MEA", "SAS", "SSF", "NAC")
rename groupcode region
keep region countryname countrycode
save "output\WBregion.dta", replace

********************************************************************************
** J . GDP ratios for standardizing 
********************************************************************************

* Conversion ratio for 2015: GDP in current USD (gdp_us_curr2015) divided by
* GDP in PPP, constant 2011 international $ (gdp_ppp2015)
use "output\GDP_ppp2011constantplus.dta", clear
keep countryname countrycode gdp_ppp2015

* Countries present in only one of the two files are kept
merge 1:1 countrycode using "output\GDP_US_currentplus.dta", keepusing(gdp_us_curr2015) nogenerate

gen ratio_ppp2011_k2015 = gdp_us_curr2015 / gdp_ppp2015
label variable ratio_ppp2011_k2015 "2015 (Const. USD 2015) / 2015 (PPP int$ 2011)"

save "output\GDPratio_PPP2011_to_2015.dta", replace


********************************************************************************
********************************************************************************
**** 2. Agriculture
********************************************************************************
********************************************************************************

********************************************************************************
** A . FAOSTAT - Government Expenditure on Agriculture
** 		http://www.fao.org/faostat/en/#data/IG 
**		Downloaded on 1/11/2019
** 		Agriculture, forestry, fishing (General government)
** 		LCU, millions, current
********************************************************************************
import excel "input\FAOSTAT_agexpd(11January2019).xlsx", sheet("FAOSTAT_data_1-11-2019 (2)") firstrow case(lower) clear

keep area element item itemcode year unit value
rename area countryname

* Values are reported in millions; scale to single LCU
replace value = value * 1000000
drop unit

* Short item names keyed on the FAO item code
replace item = "ag_ff" if itemcode == 23131
replace item = "env_pro" if itemcode == 23144
replace item = "ag_rd" if itemcode == 23143
drop itemcode

* One row per country x item, one fao<year> column per year
rename value fao
reshape wide fao, i(countryname item) j(year)

* Harmonize country names so country codes can be attached
CLEAN_COUNTRY_NAMES countryname

replace countryname = "Curaçao" if countryname=="Curacao"
replace countryname = "Czech Republic" if countryname=="Czechia"
replace countryname = "Côte d'Ivoire" if countryname=="Cote d'Ivoire"
replace countryname = "Korea, Dem. People's Rep." if countryname=="Korea, Dem. People’s Rep."
replace countryname = "São Tomé and Principe" if countryname=="Sao Tome and Principe"
replace countryname = "Armenia" if countryname == "Armenia, Republic of"
replace countryname = "Azerbaijan" if countryname == "Azerbaijan, Republic of"
replace countryname = "Marshall Islands" if countryname == "Marshall Islands, Republic of"
replace countryname = "Serbia" if countryname == "Serbia, Republic of"

* Attach country codes (the lookup file uses CamelCase variable names)
rename countryname CountryName
merge m:1 CountryName using "input\unpop_731_clean.dta", keepusing(CountryCode) keep(master match) nogenerate

* Manual codes for names the lookup does not resolve
replace CountryCode = "KNA" if CountryName=="St. Kitts and Nevis"
replace CountryCode = "CHN" if CountryName=="China, mainland"
replace CountryCode = "SWZ" if CountryName=="Eswatini"

rename (CountryName CountryCode) (countryname countrycode)
drop if countrycode == ""

* Express spending as a share of GDP (LCU current); exact zeros become missing
merge m:1 countrycode using "output/GDP_lcu_currentplus.dta", keep(master match) nogenerate

forvalues yr = 2001/2017 {
	gen fao_pctgdp`yr' = fao`yr' / gdp_lcu`yr'
	label variable fao_pctgdp`yr' "Ag spending as share of GDP"
	replace fao_pctgdp`yr' = . if fao_pctgdp`yr' == 0
}

* Most recent year with data, and its value (scan from 2017 backwards and keep
* the first non-missing hit)
gen FAOrecentyear = .
label variable FAOrecentyear "FAO (2018), Most recent year with data for indicator"
gen FAOrecentvalue = .
label variable FAOrecentvalue "FAO (2018), Most recent observation for indicator"

forvalues yr = 2017(-1)2001 {
	replace FAOrecentyear = `yr' if fao_pctgdp`yr' != . & FAOrecentyear == .
	replace FAOrecentvalue = fao_pctgdp`yr' if fao_pctgdp`yr' != . & FAOrecentvalue == .
}

* Three-year average ending in the most recent year with data: one helper
* variable per distinct end year, filled only for the rows with that end year
levelsof FAOrecentyear if FAOrecentyear >= 2003, local(endyears)
foreach last of local endyears {
	local first = `last' - 2
	local mid = `last' - 1
	egen FAOma_`first'_`last' = rowmean(fao_pctgdp`first' fao_pctgdp`mid' fao_pctgdp`last') if FAOrecentyear == `last'
}
order FAOma_*, sequential

* Each row has at most one non-missing helper, so the row maximum picks it up
egen FAO_3yma_recent = rowmax(FAOma_*)
label variable FAO_3yma_recent "FAO (2018) 3-year moving average from most recent available year"

* Mongolia has data only in 2002; cannot do 3-year average (no 2000 observations in FAO), replace with single observation
replace FAO_3yma_recent = fao_pctgdp2002 if countrycode == "MNG"

* Apply the recent 3-year share to the 2015-2030 GDP trajectory
merge m:1 countrycode using "output\gdp_cons2030wide.dta", keep(master match) nogenerate

forvalues yr = 2015/2030 {
	gen FAOgdp_`yr' = FAO_3yma_recent * gdp_traj`yr'
	label variable FAOgdp_`yr' "Public spending on AG (FAO), `yr'"
}
order FAOgdp_20*, sequential
egen FAOgdp_1530 = rowmean(FAOgdp_2015-FAOgdp_2030)
label variable FAOgdp_1530 "Public spending on Ag (FAO), avg. 2015-2030"

* Code "no data" as year 0 and keep only the ag_ff item rows
replace FAOrecentyear = 0 if FAOrecentyear == .
keep if item == "ag_ff"

save "output/faostat_onlyAGclean.dta", replace

********************************************************************************
** B . IFPRI - SPEED
** 		https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/INZ3QK 
**		Downloaded on 1/11/2019
** 		General government total expenditure on agriculture
** 		% of GDP
********************************************************************************
import excel "input\IFPRI_SPEED_full2015(11January2019).xlsx", sheet("gdpag_ppp") firstrow case(lower) clear

* Columns e-ak: rename each to IFPRI2_agGDP_tol<column label>
foreach col of varlist e-ak {
	local lbl : variable label `col'
	rename `col' IFPRI2_agGDP_tol`lbl'
}

rename country countryname
rename iso countrycode

drop if countryname == ""

* Percentages to decimals; exact zeros become missing
forvalues yr = 1980/2012 {
	replace IFPRI2_agGDP_tol`yr' = IFPRI2_agGDP_tol`yr' / 100
	replace IFPRI2_agGDP_tol`yr' = . if IFPRI2_agGDP_tol`yr' == 0
}

* Most recent year with data since 2000, and its value (scan from 2012
* backwards and keep the first non-missing hit)
gen IFPRI2recentyear = .
label variable IFPRI2recentyear "IFPRI SPEED (2015), Most recent year with data for indicator"
gen IFPRI2recentvalue = .
label variable IFPRI2recentvalue "IFPRI SPEED (2015), Most recent observation for indicator"

forvalues yr = 2012(-1)2000 {
	replace IFPRI2recentyear = `yr' if IFPRI2_agGDP_tol`yr' != . & IFPRI2recentyear == .
	replace IFPRI2recentvalue = IFPRI2_agGDP_tol`yr' if IFPRI2_agGDP_tol`yr' != . & IFPRI2recentvalue == .
}

* Three-year average ending in the most recent year with data: one helper
* variable per distinct end year, filled only for the rows with that end year
levelsof IFPRI2recentyear if IFPRI2recentyear >= 2002, local(endyears)
foreach last of local endyears {
	local first = `last' - 2
	local mid = `last' - 1
	egen IFPRI2ma_`first'_`last' = rowmean(IFPRI2_agGDP_tol`first' IFPRI2_agGDP_tol`mid' IFPRI2_agGDP_tol`last') if IFPRI2recentyear == `last'
}
order IFPRI2ma_*, sequential

* Each row has at most one non-missing helper, so the row maximum picks it up
egen IFPRI2_3yma_recent = rowmax(IFPRI2ma_*)
label variable IFPRI2_3yma_recent "IFPRI SPEED (2015) 3-year moving average from most recent available year"

* Apply the recent 3-year share to the 2015-2030 GDP trajectory
merge m:1 countrycode using "output\gdp_cons2030wide.dta", keep(master match) nogenerate

forvalues yr = 2015/2030 {
	gen IFPRI2gdp_`yr' = IFPRI2_3yma_recent * gdp_traj`yr'
	label variable IFPRI2gdp_`yr' "Public spending on Ag (IFPRI), `yr'"
}
order IFPRI2gdp_20*, sequential
egen IFPRI2gdp_1530 = rowmean(IFPRI2gdp_2015-IFPRI2gdp_2030)
label variable IFPRI2gdp_1530 "Public spending on Ag (IFPRI), avg. 2015-2030"

* Code "no data" as year 0 and drop rows without a country code
replace IFPRI2recentyear = 0 if IFPRI2recentyear == .
drop if countrycode == ""

save "output/IFPRI2ag_clean.dta", replace

* Supplement with Government Spending Watch (2019)
 * http://www.governmentspendingwatch.org/spending-data
 * Downloaded on 1/31/2019
 * Planned government spending on agriculture
 * % of GDP
********************************************************************************

* One workbook per country: save the raw Agriculture sheet, then keep only the
* "% GDP" line for "Agriculture expenditure" and tag it with the workbook name
local workbooks: dir "input\GovSpendingWatch\" files "*.xlsx", respectcase
foreach wb in `workbooks' {
	import excel using "input/GovSpendingWatch/`wb'", sheet("Agriculture") cellrange(A2:K57) firstrow clear
	save "input/`wb'_ag.dta", replace

	keep if DataQuestion == "% GDP" & A == "Agriculture expenditure"
	gen countryname = "`wb'"
	* Columns may or may not be strings, hence capture on both steps
	foreach col of varlist Planned-K {
		capture replace `col' = "." if `col' == "No Data"
		capture destring `col', replace
	}
	save "output/`wb'_ag.dta", replace
}

* Stack the per-country files. Afghanistan seeds the stack and is appended a
* second time by the loop; the duplicate row is removed right after.
use "output/Afghanistan.xlsx_ag.dta", clear
local workbooks: dir "input\GovSpendingWatch\" files "*.xlsx"
foreach wb in `workbooks' {
	append using "output/`wb'_ag.dta", force
}
save "output\AGcombined_govspendingwatch.dta", replace

duplicates drop // Afghanistan

* Keep the planned-expenditure columns (best coverage)
keep A DataQuestion Planned F I	countryname
rename Planned plan2015
rename F plan2016
rename I plan2017
label variable plan2015 "2014/15 (2015) % of GDP"
label variable plan2016 "2015/16 (2016) % of GDP"
label variable plan2017 "2016/17 (2017) % of GDP"

* Average of the available planned years, converted from percent to decimal
egen GSW_3yma_recent = rowmean(plan2015 plan2016 plan2017)
replace GSW_3yma_recent = GSW_3yma_recent / 100
label variable GSW_3yma_recent "3-year average government spending on Agriculture (Gov. Spending Watch), % of GDP"

* Strip the 5-character ".xlsx" suffix left over from the workbook name
replace countryname = substr(countryname, 1, strlen(countryname) - 5)

* Harmonize country names so country codes can be attached
CLEAN_COUNTRY_NAMES countryname

* Attach country codes: rows that match no UN member state are dropped, and
* member states without a GSW row are kept
merge 1:1 countryname using "output\UN_memberstates.dta", keepusing(countrycode) keep(match using) nogenerate

keep countryname countrycode GSW_3yma_recent

* Supplement with World Bank (2014)
 * https://openknowledge.worldbank.org/bitstream/handle/10986/23506/Background0papers.pdf?sequence=1 - Pg. 29, section 46
 * Agriculture expenditure
 * % of total expenditure
********************************************************************************
replace GSW_3yma_recent = .005552 if countrycode == "SDN"

* Apply the share to the 2015-2030 GDP trajectory
merge m:1 countrycode using "output\gdp_cons2030wide.dta", keep(master match) nogenerate

forvalues yr = 2015/2030 {
	gen GSWgdp_`yr' = GSW_3yma_recent * gdp_traj`yr'
	label variable GSWgdp_`yr' "Public spending on Ag (Gov. Spending Watch), `yr'"
}
order GSWgdp_20*, sequential
egen GSWgdp_1530 = rowmean(GSWgdp_2015-GSWgdp_2030)
label variable GSWgdp_1530 "Public spending on Ag (Gov. Spending Watch), avg. 2015-2030"

* Empty placeholders so this file has the same recentyear/recentvalue
* variables as the FAO and IFPRI files
gen GSWrecentyear = .
gen GSWrecentvalue = .

save "output/GSWag_clean.dta", replace

* Merge three agriculture datasets (identify which has most recent data)
* Result: one row per UN member state with the ag* variables taken from a
* single source per country, plus world totals for 2015 and 2030.

use "output/faostat_onlyAGclean.dta", clear
merge 1:1 countrycode using "output/IFPRI2ag_clean.dta", keepusing(countryname *recentvalue *recentyear IFPRI2_3yma_recent IFPRI2gdp_20* IFPRI2gdp_1530 gdp_traj*)
drop _merge
merge 1:1 countrycode using "output/GSWag_clean.dta", keepusing(countryname GSWrecentyear GSWrecentvalue GSW_3yma_recent GSWgdp_20* GSWgdp_1530 gdp_traj*)
drop _merge

* Limit to UN member states (member states with no agriculture data are kept)
merge 1:1 countrycode using "output\UN_memberstates.dta"
drop if _merge== 1
drop _merge

* Identify which dataset has the most recent available observation for each country
 * FAO is used only when its most recent year is strictly later than IFPRI's
 * If IFPRI and FAO are both missing an observation, use Government Spending Watch

gen agset = ""
* Rows that did not come from the IFPRI file have a missing year; code it as 0
* so that the comparison with FAOrecentyear below behaves as "no IFPRI data"
replace IFPRI2recentyear = 0 if IFPRI2recentyear == .
replace agset = "FAO" if FAOrecentyear > IFPRI2recentyear & FAOrecentyear != . & FAO_3yma_recent != .
replace agset = "IFPRI2" if agset == "" & IFPRI2recentyear != . & IFPRI2recentyear != 0 & IFPRI2_3yma_recent != .
replace agset = "GSW" if agset == "" & GSW_3yma_recent != .

* Fill overall ag spending variables from the source chosen in agset

gen agyear = .
gen agvalue = .
gen ag3yma_recent = .
 label variable ag3yma_recent "Agriculture 3-year average % of GDP"
forvalues yr = 2015/2030 {
	gen aggdp_`yr' = .
	label variable aggdp_`yr' "Public spending on agriculture, `yr'"
}
gen aggdp_1530 = .
 label variable aggdp_1530 "Public spending on agriculture, avg. 2015-2030"

* Each agset value (FAO, IFPRI2, GSW) doubles as the variable-name prefix of
* that source, so the source's variables can be addressed as `src'<suffix>
levelsof agset, local(sources)
foreach src of local sources {
	replace agyear = `src'recentyear if agset == "`src'"
	replace agvalue = `src'recentvalue if agset == "`src'"
	replace ag3yma_recent = `src'_3yma_recent if agset == "`src'"
	forvalues yr = 2015/2030 {
		replace aggdp_`yr' = `src'gdp_`yr' if agset == "`src'"
	}
	replace aggdp_1530 = `src'gdp_1530 if agset == "`src'"
}

* NOTE(review): this relabels Sudan regardless of which source was selected
* above; presumably Sudan is always sourced via the GSW slot - confirm
replace agset = "WB Sudan" if countrycode == "SDN"

* Calculate total spending on agriculture in 2015 and 2030
* egen total() without by: writes the grand total into every observation, so
* no sort or carry-forward is needed. Sorting on a constant would only
* reorder the rows randomly (ties are broken at random without -stable-).

	* 2015
	egen tolaggdp_2015 = total(aggdp_2015)
	 label variable tolaggdp_2015 "World total public spending on Ag, 2015"

	* 2030
	egen tolaggdp_2030 = total(aggdp_2030)
	 label variable tolaggdp_2030 "World total public spending on Ag, 2030"

save "output/ag_shGDP_combined.dta", replace


********************************************************************************
********************************************************************************
**** 3. Health
********************************************************************************
********************************************************************************

********************************************************************************
** A . WHO - Global Health Observatory data repository
** 		http://apps.who.int/gho/data/view.main.GHEDGGHEDGDPSHA2011v
**		Downloaded on 1/11/2019
** 		Domestic general government health expenditure
** 		% of GDP
********************************************************************************

* Import the WHO sheet: one row per country, year columns B-Q.
* Paths use forward slashes, which Stata accepts on every platform.
import excel "input/WHO_gge_healthGDP(15January2019).xlsx", sheet("data") cellrange(A2:Q196) firstrow case(lower) clear

rename country countryname

* Year columns arrive named by spreadsheet letter (b-q); each one is renamed
* to "health" followed by its variable label
foreach var of varlist b-q {
	* Clean and destring only the columns that were imported as strings. On a
	* column that is already numeric the string comparison below would fail
	* with a type mismatch and stop the do-file.
	local vtype : type `var'
	if substr("`vtype'", 1, 3) == "str" {
		replace `var' = "" if `var' == "No data"
		destring `var', replace
	}
	local x : variable label `var'
	rename `var' health`x'
}

* Fix entries for Australia and Japan. 2015 value is 0 due to likely database error
replace health2015 = health2014 if inlist(countryname, "Australia", "Japan")

* Clean country names (program defined elsewhere), then patch the spellings
* that still differ from the population file used for country codes
CLEAN_COUNTRY_NAMES countryname

replace countryname = "Curaçao" if countryname=="Curacao"
replace countryname = "Czech Republic" if countryname=="Czechia"
replace countryname = "Côte d'Ivoire" if countryname=="Cote d'Ivoire"
replace countryname = "Korea, Dem. People's Rep." if countryname=="Korea, Dem. People’s Rep."
replace countryname = "São Tomé and Principe" if countryname=="Sao Tome and Principe"
replace countryname = "Armenia" if countryname == "Armenia, Republic of"
replace countryname = "Azerbaijan" if countryname == "Azerbaijan, Republic of"
replace countryname = "Marshall Islands" if countryname == "Marshall Islands, Republic of"
replace countryname = "Serbia" if countryname == "Serbia, Republic of"
replace countryname = "St. Vincent and the Grenadines" if countryname == "St. Vincent and Genadines"

* Add countrycodes (the using file keys on CountryName, hence the temporary rename)
rename countryname CountryName
merge m:1 CountryName using "input/unpop_731_clean.dta", keepusing(CountryCode)
drop if _merge==2
drop _merge

* Codes assigned by hand for names the merge leaves without a code
replace CountryCode = "KNA" if CountryName=="St. Kitts and Nevis"
replace CountryCode = "LCA" if CountryName=="St. Lucia"
replace CountryCode = "SWZ" if CountryName=="Eswatini"
drop if CountryCode == ""

rename CountryName countryname
rename CountryCode countrycode

* Convert percentages to decimals
foreach num of numlist 2000(1)2015 {
	replace health`num' = health`num' / 100
}

* Replace countries with missing if value listed as zero (e.g. Japan has 0 for health spending in one year)
foreach var of varlist health* {
	replace `var' = . if `var' == 0
}

* Limit to UN member states (member states without WHO data are kept as rows)
merge 1:1 countrycode using "output/UN_memberstates.dta"
drop if _merge== 1
drop _merge

* Latest year with data, and the value observed in that year

gen healthrecentyear = .
 label variable healthrecentyear "WHO, 2019, Most recent year with data for indicator"
gen healthrecentvalue = .
 label variable healthrecentvalue "WHO, 2019, Most recent observation for indicator"

* Walk backwards from 2015 so the first non-missing year found is the latest
forvalues yr = 2015(-1)2000 {
	replace healthrecentvalue = health`yr' if health`yr' != . & healthrecentvalue == .
	replace healthrecentyear = `yr' if health`yr' != . & healthrecentyear == .
}

* Three-year average ending in the latest year with data

* The year columns must sit next to each other for the variable ranges below
order countryname health*, sequential
levelsof healthrecentyear if healthrecentyear >= 2002, local(levels)
foreach lastyr of local levels {
	local prior = `lastyr' - 2
	* One helper column per end year, filled only for countries ending there
	egen healthma_`prior'_`lastyr' = rowmean(health`prior'-health`lastyr') if healthrecentyear == `lastyr'
}
order healthma_*, sequential

* Each country has at most one non-missing helper column, so rowmax picks it
egen health_3yma_recent = rowmax(healthma_*)
 label variable health_3yma_recent "WHO 2019, 3-year moving average from most recent available year"

* Spending path to 2030: recent 3-year average share applied to the GDP
* trajectory for each year (constant, 2015 USD)

merge m:1 countrycode using "output\gdp_cons2030wide.dta", keep(master match) nogenerate

	* Annual levels for 2015-2030, then their average
	forvalues yr = 2015/2030 {
		gen healthgdp_`yr' = health_3yma_recent * gdp_traj`yr'
		label variable healthgdp_`yr' "Public spending on health (WHO), `yr'"
	}
    order healthgdp_20*, sequential
	egen healthgdp_1530 = rowmean(healthgdp_2015-healthgdp_2030)
	 label variable healthgdp_1530 "Public spending on health (WHO), avg. 2015-2030"

* World totals of health spending in 2015 and 2030

	* 2015: total over all rows, stored on every observation
	egen tolhealthgdp_2015 = sum(healthgdp_2015)
	sort tolhealthgdp_2015
	carryforward tolhealthgdp_2015, replace
	 label variable tolhealthgdp_2015 "World total public spending on health, 2015"

	* 2030: same construction
	egen tolhealthgdp_2030 = sum(healthgdp_2030)
	sort tolhealthgdp_2030
	carryforward tolhealthgdp_2030, replace
	 label variable tolhealthgdp_2030 "World total public spending on health, 2030"

* Countries without any observation get year 0 instead of missing
replace healthrecentyear = 0 if healthrecentyear == .
drop if countrycode == ""

save "output/WHO_health_pctgdp.dta", replace


********************************************************************************
********************************************************************************
**** 4. Education
********************************************************************************
********************************************************************************

********************************************************************************
** A . World Bank: WDI
** 		WDI: SE.XPD.TOTL.GD.ZS
** 		General government expenditure on education 
** 		% of GDP
********************************************************************************
wbopendata, indicator(SE.XPD.TOTL.GD.ZS) nometadata year(2000:2017) clear

* Drop aggregate rows; Nauru is given a region so the blank-region drop keeps it
drop if region == "Aggregates"
replace region = "EAP" if countryname == "Nauru"
drop if region == ""

* Rename and label each year column, and convert percentages to decimals
forvalues yr = 2000/2017 {
	rename yr`yr' ed_gdp`yr'
	label variable ed_gdp`yr' "`yr' Gov expend. on education, % GDP"
	replace ed_gdp`yr' = ed_gdp`yr' / 100
}

keep countryname countrycode region ed_gdp*

* Supplement with Xinhua News Agency (2016)
 * http://www.gov.cn/shuju/2016-11/10/content_5131034.htm
 * Total government investment in education
 * LCU (the values entered below are decimal shares of GDP)
replace ed_gdp2014 = .0410 if countrycode == "CHN"
replace ed_gdp2015 = .0426 if countrycode == "CHN"

* Supplement with World Bank (2018)
 * http://documents.worldbank.org/curated/en/346771542864299850/pdf/132316-21-11-2018-17-31-9-NigeriaBEUAMF.pdf - Pg. 57
 * Public spending on education
 * % of GDP
replace ed_gdp2015 = .0170 if countrycode == "NGA"

* Supplement with Global Partnership for Education (2018)
 * https://www.globalpartnership.org/content/pledge-papua-new-guinea-gpes-3rd-replenishment
 * Public expenditure on education (recurrent and capital)
 * LCU (the values entered below are decimal shares of GDP)
replace ed_gdp2014 = .03664 if countrycode == "PNG"
replace ed_gdp2015 = .03431 if countrycode == "PNG"
replace ed_gdp2016 = .03935 if countrycode == "PNG"

* Limit to UN member states (member states without WDI data are kept as rows)
merge 1:1 countrycode using "output\UN_memberstates.dta", keep(match using) nogenerate

* Latest year with data, and the value observed in that year

gen edrecentyear = .
 label variable edrecentyear "WDI, 2019, Most recent year with data for indicator"
gen edrecentvalue = .
 label variable edrecentvalue "WDI, 2019, Most recent observation for indicator"

* Walk backwards from 2017 so the first non-missing year found is the latest
forvalues yr = 2017(-1)2000 {
	replace edrecentvalue = ed_gdp`yr' if ed_gdp`yr' != . & edrecentvalue == .
	replace edrecentyear = `yr' if ed_gdp`yr' != . & edrecentyear == .
}

* Three-year average ending in the latest year with data

levelsof edrecentyear if edrecentyear >= 2002, local(levels)
foreach lastyr of local levels {
	local prior = `lastyr' - 2
	* One helper column per end year, filled only for countries ending there
	egen edma_`prior'_`lastyr' = rowmean(ed_gdp`prior'-ed_gdp`lastyr') if edrecentyear == `lastyr'
}
order edma_*, sequential

* Each country has at most one non-missing helper column, so rowmax picks it
egen ed_3yma_recent = rowmax(edma_*)
 label variable ed_3yma_recent "UNESCO, 3-year moving average from most recent available year"
 
* Spending path to 2030: recent 3-year average share applied to the GDP
* trajectory for each year (constant, 2015 USD)

merge m:1 countrycode using "output\gdp_cons2030wide.dta", keep(master match) nogenerate

	* Annual levels for 2015-2030, then their average
	forvalues yr = 2015/2030 {
		gen edgdp_`yr' =  ed_3yma_recent * gdp_traj`yr'
		label variable edgdp_`yr' "Public spending on education (UNESCO), `yr'"
	}
    order edgdp_20*, sequential
	egen edgdp_1530 = rowmean(edgdp_2015-edgdp_2030)
	 label variable edgdp_1530 "Public spending on education (UNESCO), avg. 2015-2030"

* World totals of education spending in 2015 and 2030

	* 2015: total over all rows, stored on every observation
	egen toledgdp_2015 = sum(edgdp_2015)
	sort toledgdp_2015
	carryforward toledgdp_2015, replace
	 label variable toledgdp_2015 "World total public spending on education, 2015"

	* 2030: same construction
	egen toledgdp_2030 = sum(edgdp_2030)
	sort toledgdp_2030
	carryforward toledgdp_2030, replace
	 label variable toledgdp_2030 "World total public spending on education, 2030"

* Countries without any observation get year 0 instead of missing
replace edrecentyear = 0 if edrecentyear == .

save "output\education_pctGDP.dta", replace


********************************************************************************
********************************************************************************
**** 5. Social spending
********************************************************************************
********************************************************************************

********************************************************************************
** A . ILO - World Social Protection Report Data 2017-2019
** 		http://www.ilo.org/gimi/gess/AggregateIndicator.action#expenditure
** 		Downloaded on 1/29/2019
**		Public social protection expenditure, excluding health, latest available year 
**		% of GDP
********************************************************************************
* Convert each regional ILO extract to .dta, keeping the expenditure indicator
* and the country identifiers. Europe is processed last so that it is the
* dataset left in memory for the appends below.
* Forward slashes are required here: a backslash directly in front of a macro
* reference would stop the macro from being expanded.
foreach reg in africa america arab asia europe {
	import delimited "input/ILO_SS/`reg'.csv", clear
	keep agrindexpenditure countryiso3 countrynameen
	save "input/ILO`reg'.dta", replace
}

* Stack the other four regions underneath Europe
foreach reg in africa america arab asia {
	append using "input/ILO`reg'.dta"
}

* Destring and replace NA with missing
rename agrindexpenditure sopro_exhealth
 label variable sopro_exhealth "ILO 2017-2019: Public social protection expenditure, excluding health latest available year (% of GDP)"
rename countryiso3 countrycode
rename countrynameen countryname 

replace sopro_exhealth = "." if sopro_exhealth == "NA"
destring sopro_exhealth, replace
* Convert percentages to decimals
replace sopro_exhealth = sopro_exhealth / 100

* Limit to UN member states (the code IRE is recoded to IRL so it can match)
replace countrycode = "IRL" if countrycode == "IRE"
merge 1:1 countrycode using "output/UN_memberstates.dta"
drop if _merge== 1
drop _merge

save "output/ILO_SS2017_2019.dta", replace

* Supplement with ILO (2014)
 * https://www.ilo.org/wcmsp5/groups/public/---dgreports/---dcomm/documents/publication/wcms_245201.pdf - Table B.12
 * Public social protection (excluding health care)
 * % of GDP
********************************************************************************

* The workbook is copied to a temporary file and imported from that copy
tempfile tmp
copy "input\ILOworld_social_protection2014.xlsx" `tmp'
import excel using `tmp', sheet("table") firstrow clear
erase `tmp'

* Rows without a country name carry no data
keep if countryname != ""

* Clean variable: the ellipsis placeholder becomes missing, then percentages
* are converted to decimals
replace PP_exchealth = "." if PP_exchealth == "…"
destring PP_exchealth, replace
replace PP_exchealth = PP_exchealth / 100
 label variable PP_exchealth "ILO 2014-2015: Public social protection expenditure, excluding health latest available year (% of GDP)"

* Standardise country names so they match the UN member-state list
CLEAN_COUNTRY_NAMES countryname
replace countryname = "Macedonia, FYR" if countryname == "The Former Yugoslav Republic"
replace countryname = "Samoa" if countryname == "Western Samoa"

* Add countrycodes; names not found in the member-state list are dropped
merge 1:1 countryname using "output\UN_memberstates.dta", keep(match using) nogenerate
save "output\ILO2014_SSpending.dta", replace

* Combine the two ILO vintages: the 2017-2019 value is used where present,
* otherwise the 2014 value
use "output\ILO_SS2017_2019.dta", clear
merge 1:1 countrycode using "output\ILO2014_SSpending.dta", keepusing(PP_exchealth) nogenerate

gen SS_recent = cond(sopro_exhealth == ., PP_exchealth, sopro_exhealth)
 label variable SS_recent "Public spending on social protection (ILO), % of GDP, most recent year"

* Spending path to 2030: the single available share applied to the GDP
* trajectory for each year (constant, 2011 USD)

merge 1:1 countrycode using "output\gdp_cons2030wide.dta", keep(master match) nogenerate

	* Annual levels for 2015-2030, then the 2015-2030 and 2015-2025 averages
	forvalues yr = 2015/2030 {
		gen SSgdp_`yr' = SS_recent * gdp_traj`yr'
		label variable SSgdp_`yr' "Public spending on social protection (ILO), `yr'"
	}
	order SSgdp_20*, sequential
	egen SSgdp_1530 = rowmean(SSgdp_2015-SSgdp_2030)
	 label variable SSgdp_1530 "Public spending on social protection (ILO), avg. 2015-2030"
	egen SSgdp_1525 = rowmean(SSgdp_2015-SSgdp_2025)
	 label variable SSgdp_1525 "Public spending on social protection (ILO), avg. 2015-2025"

* World totals of social spending in 2015 and 2030

	* 2015: total over all rows, stored on every observation
	egen tolSSgdp_2015 = sum(SSgdp_2015)
	sort tolSSgdp_2015
	carryforward tolSSgdp_2015, replace
	 label variable tolSSgdp_2015 "World total public spending on social spending, 2015"

	* 2030: same construction
	egen tolSSgdp_2030 = sum(SSgdp_2030)
	sort tolSSgdp_2030
	carryforward tolSSgdp_2030, replace
	 label variable tolSSgdp_2030 "World total public spending on social spending, 2030"

save "output\socialspending_pctGDP.dta", replace


********************************************************************************
********************************************************************************
*** 6. Infrastructure
********************************************************************************
********************************************************************************

********************************************************************************
** A . OECD - National Account Statistics
**  	https://data.oecd.org/gga/general-government-spending.htm
**		Downloaded on 2/8/2019
**		General government spending on defence (gross fixed capital formation) 
**		General government spending on defence (gross capital formation) 
********************************************************************************

* The workbook is copied to a temporary file and imported from that copy
tempfile tmp
copy "input\OECD_gfcf_defense(8February2019).xlsx" `tmp'
import excel using `tmp', sheet("defence") firstrow clear
erase `tmp'

rename (LOCATION Country Year Value TRANSACT Unit) (countrycode countryname year defense tcode lcu)

* Scale the reported values by one million
replace defense = defense * 1000000

* One row per country-year, one defense column per transaction code
keep countrycode countryname year defense lcu tcode
reshape wide defense, i(countrycode year) j(tcode, string)

* Give the transaction-code columns readable suffixes
rename *P51CG *_gfcf
 label variable defense_gfcf "Gen gov gross fixed capital formation: defense"
rename *P5CG *_gcf 
 label variable defense_gcf "Gen gov gross capital formation: defense"
rename *P5_K2CG *_gcfl
 label variable defense_gcfl "Gen gov gross capital formation and acq. less disposals of non-fin. non-prod.assets: defense"
drop defense_gcfl

sort countryname year

* Treat reported zeros as missing
 * GFCF: Costa Rica and Israel
   replace defense_gfcf = . if defense_gfcf == 0
 * GCF: Costa Rica
   replace defense_gcf = . if defense_gcf == 0

* Limit year sample to match total gfcf
keep if inrange(year, 2010, 2015)

* Attach GDP in current LCU to form shares of GDP
merge 1:1 countrycode year using "output\GDP_lcu_currentLONG.dta", keep(master match) nogenerate

gen defgdp_gfcf = defense_gfcf / gdp_lcu
 label variable defgdp_gfcf "GFCF defense, % of GDP"

gen defgdp_gcf = defense_gcf / gdp_lcu
 label variable defgdp_gcf "GCF defense, % of GDP"

drop gdp_lcu defense_gcf defense_gfcf

* One row per country; the sequential order keeps each series' years adjacent
reshape wide defgdp_gfcf defgdp_gcf, i(countrycode) j(year)
order countrycode countryname lcu defgdp_gfcf* defgdp_gcf*, sequential

* Average over 2013-2015, using whichever of those years are available

* Gross fixed capital formation
egen def_gfcf_3yma = rowmean(defgdp_gfcf2013-defgdp_gfcf2015)
 label variable def_gfcf_3yma "3-year moving avg gen gov gross fixed capital formation (% of GDP): defense"

* Gross capital formation
egen def_gcf_3yma = rowmean(defgdp_gcf2013-defgdp_gcf2015)
 label variable def_gcf_3yma "3-year moving avg gen gov gross capital formation (% of GDP): defense"

* Use gross fixed capital formation; fall back to gross capital formation where it is missing
gen def_3yma = cond(def_gfcf_3yma == ., def_gcf_3yma, def_gfcf_3yma)
 label variable def_3yma "3-year moving avg gen gov gross fixed capital formation (% of GDP): defense"

save "output\OECD_defence_gfcf.dta", replace 
 
 
********************************************************************************
** B . IMF - Investment and capital stock dataset
**		http://data.imf.org/?sk=1CE8A55F-CFA7-4BC0-BCE2-256EE65AC0E4 
**		Downloaded on 2/5/2019
**		General government investment (gross fixed capital formation)
**		2011 PPP
********************************************************************************

* The workbook is copied to a temporary file and imported from that copy
tempfile tmp
copy "input\IMF_gfcf(5February2019).xlsx" `tmp'
import excel using `tmp', sheet("Data") firstrow case(lower) clear
erase `tmp'

rename (isocode country) (countrycode countryname)
keep countrycode countryname year igov_rppp

* Only 2000 onwards is used
keep if year >= 2000

* A dash marks missing values in the source
replace igov_rppp = "." if igov_rppp == "-"
destring igov_rppp, replace

* Variable is in Billions, convert
replace igov_rppp = igov_rppp * 1e9
 label variable igov_rppp "General government investment (GFCF), billions const. int. $ 2011"

* Merge with GDP PPP, constant 2011 int. $ (ROM is recoded to ROU so it can match)
replace countrycode = "ROU" if countrycode == "ROM"
merge 1:1 countrycode year using "output\GDP_ppp2011constantLONG.dta", keep(master match) nogenerate

* General government investment as a share of GDP
gen gg_gfcf = igov_rppp / gdp_ppp
 label variable gg_gfcf "General government investment (GFCF), % of GDP (IMF)"
drop gdp_ppp region igov_rppp

* One row per country, one column per year
reshape wide gg_gfcf, i(countrycode) j(year)

* Limit to UN member states (member states without IMF data are kept as rows)
merge 1:1 countrycode using "output\UN_memberstates.dta", keep(match using) nogenerate

* Afghanistan has no GFCF data; supplement with integritywatch.org budget estimates (converted to % of GDP using WDI GDP current, LCU)
* https://iwaweb.org/wp-content/uploads/2017/12/IWA__National-Budget__English_6.pdf
* The 2016 and 2017 columns are created here and are missing for every other country
gen gg_gfcf2016 = cond(countrycode == "AFG", .06864, .)
gen gg_gfcf2017 = cond(countrycode == "AFG", .06542, .)

* Keep the year columns adjacent for the variable ranges below
order gg_gfcf*, sequential

* Latest year with data, and the value observed in that year

gen inf_recentyear = .
 label variable inf_recentyear "IMF, 2017, Most recent year with data for indicator"
gen inf_recentvalue = .
 label variable inf_recentvalue "IMF, 2017, Most recent observation for indicator"

* Walk backwards from 2017 so the first non-missing year found is the latest
forvalues yr = 2017(-1)2000 {
	replace inf_recentvalue = gg_gfcf`yr' if gg_gfcf`yr' != . & inf_recentvalue == .
	replace inf_recentyear = `yr' if gg_gfcf`yr' != . & inf_recentyear == .
}

* Three-year average ending in the latest year with data

levelsof inf_recentyear if inf_recentyear >= 2002, local(levels)
foreach lastyr of local levels {
	local prior = `lastyr' - 2
	* One helper column per end year, filled only for countries ending there
	egen infma_`prior'_`lastyr' = rowmean(gg_gfcf`prior'-gg_gfcf`lastyr') if inf_recentyear == `lastyr'
}
order infma_*, sequential

* Each country has at most one non-missing helper column, so rowmax picks it
egen inf_3yma_recent = rowmax(infma_*)
 label variable inf_3yma_recent "IMF, 3-year moving average from most recent available year"

* Bring in the defense share and subtract it from the 3-year average; where
* no defense figure exists the average is used unchanged

merge 1:1 countrycode using "output\OECD_defence_gfcf.dta", keepusing(def_3yma) nogenerate

gen infexdef_3yma_recent = cond(def_3yma == ., inf_3yma_recent, inf_3yma_recent - def_3yma)
 label variable infexdef_3yma_recent "GFCF excl. defense (where possible) 3-year moving average from most recent available year"


* Spending path to 2030: recent 3-year average share applied to the GDP
* trajectory for each year (constant, 2011 USD)

merge m:1 countrycode using "output\gdp_cons2030wide.dta", keep(master match) nogenerate

 	* Annual levels for 2015-2030, then their average
	forvalues yr = 2015/2030 {
		gen infgdp_`yr' =  infexdef_3yma_recent * gdp_traj`yr'
		label variable infgdp_`yr' "Public spending on infrastructure excl. defense where possible (IMF), `yr'"
	}
 	order infgdp_20*, sequential
	egen infgdp_1530 = rowmean(infgdp_2015-infgdp_2030)
	 label variable infgdp_1530 "Public spending on infrastructure excl. defense where possible (IMF), avg. 2015-2030"

* World totals of infrastructure spending in 2015 and 2030

	* 2015: total over all rows, stored on every observation
	egen tolinfgdp_2015 = sum(infgdp_2015)
	sort tolinfgdp_2015
	carryforward tolinfgdp_2015, replace
	 label variable tolinfgdp_2015 "World total public spending on infrastructure excl. defense where possible, 2015"

	* 2030: same construction
	egen tolinfgdp_2030 = sum(infgdp_2030)
	sort tolinfgdp_2030
	carryforward tolinfgdp_2030, replace
	 label variable tolinfgdp_2030 "World total public spending on infrastructure excl. defense where possible, 2030"

* Countries without any observation get year 0 instead of missing
replace inf_recentyear = 0 if inf_recentyear == .

save "output\GCFC_infrastructure_pctGDP.dta", replace


********************************************************************************
********************************************************************************
**** 7. Conservation
********************************************************************************
********************************************************************************

********************************************************************************
** A . Waldron et al. 2013
**  	https://www.pnas.org/content/110/29/12144 - Appendix
**		Average annualized total of all spending flows across the years 2001-2008
**		2005 USD
********************************************************************************

* Load the Waldron et al. appendix table and keep the funding columns

import excel using "input\Waldron et al Conservation spending (PNAS2013).xlsx", firstrow clear
rename *, lower
keep country total totalaidfunding totaldomesticfunding trustfundsanddebtswaps other
rename (country total) (countryname consinv2008)
drop if countryname == ""
destring totalaidfunding, replace
* Remove the totals, percentage and footnote rows of the table
drop if inlist(countryname, "TOTALS", "% OF ALL AID", "* SERBIA AND MONTENEGRO DATA COMBINED FOR BIODIVERSITY AID")

* Convert from millions and from 2005 dollars to 2015 dollars (https://data.worldbank.org/indicator/NY.GDP.DEFL.ZS?locations=US accessed June 6, 2019)
replace consinv2008 = consinv2008*1000000*1.199729307423139

* Flag the regional and unallocated line items. They cannot be assigned to
* countries, but their sum is preserved for the world totals (it cannot be
* added to the income group chart).
gen byte regional_row = inlist(countryname, "AFRICA", "ASIA AND THE PACIFIC", "BILATERAL, UNSPECIFIED", "CENTRAL AND EASTERN EUROPE", "CENTRAL ASIA, REGIONAL", "EUROPE UNSPECIFIED", "GLOBAL", "LATIN AMERICA AND THE CARIBBEAN", "UNALLOCABLE")
egen cons_region_alloc2008 = sum(consinv2008) if regional_row

* Sorting puts the filled rows first so carryforward copies the sum to every row
sort cons_region_alloc2008
carryforward cons_region_alloc2008, replace

* Drop regional line items, keep countries
drop if regional_row
drop regional_row
sort countryname

* Standardize country names for merging
CLEAN_COUNTRY_NAMES countryname
replace countryname = "Samoa" if countryname == "Western Samoa"
replace countryname = "Belarus" if countryname == "Byelarus"

* Limit to UN member states (member states without conservation data are kept as rows)
merge m:1 countryname using "output\UN_memberstates.dta", keep(match using) nogenerate

* Change formatting of some indicators
recast str3 countrycode
recast str4 Region
gen year = 2008

* Convert numbers to percent of GDP by merging in GDP data for 2008
* (forward-slash path: accepted by Stata on every platform)
merge 1:1 countrycode year using "output/GDP_US_currentLONG.dta", keepusing(countryname gdp_us_curr)
drop if _merge == 2
drop _merge year

* Create global GDP value to estimate percent of global GDP to regional projects for conservation (will add to sector bar chart, but not to income groupings chart)
egen world_gdp_2008 = sum(gdp_us_curr)

* Create percent of GDP variable by dividing conservation estimates by 2015 deflated gdp for 2008.
* consinv2008 has already been restated in 2015 dollars, so the factor
* 1.111984823914573 has to rescale the 2008 GDP denominator. The parentheses
* matter: without them Stata evaluates left to right and multiplies the
* ratio by the factor instead of dividing by it.
gen cons_recent = consinv2008 / (gdp_us_curr * 1.111984823914573)
label variable cons_recent "Most recent observation for indicator, Conservation = 2008"

* Same correction for the regional allocation as a share of world GDP.
* NOTE(review): cons_region_alloc2008 was filled before the UN member-state
* merge, so rows added by that merge have it missing and get a missing
* reg_cons_recent - confirm this is intended.
gen reg_cons_recent = cons_region_alloc2008 / (world_gdp_2008 * 1.111984823914573)
label variable reg_cons_recent "Most recent observation for indicator, Conservation = 2008"

* Spending path to 2030: the single available share applied to the GDP
* trajectory for each year (constant, 2015 USD)

merge 1:1 countrycode using "output\gdp_cons2030wide.dta", keep(master match) nogenerate

	* Annual levels for 2015-2030, then their average
	forvalues yr = 2015/2030 {
		gen consgdp_`yr' = cons_recent * gdp_traj`yr'
		label variable consgdp_`yr' "Spending on conservation (aid + domestic + others), `yr'"
	}
	order consgdp_20*, sequential
	egen consgdp_1530 = rowmean(consgdp_2015-consgdp_2030)
	 label variable consgdp_1530 "Spending on conservation (aid + domestic + others), avg. 2015-2030"

	* World GDP on the trajectory, the base for the regional allocation
	egen traj_world_gdp_2015 = sum(gdp_traj2015)
	egen traj_world_gdp_2030 = sum(gdp_traj2030)

	* Regional spending in 2015 and 2030: regional share times world GDP
	gen reg_cons_gdp_2015 = reg_cons_recent * traj_world_gdp_2015
	gen reg_cons_gdp_2030 = reg_cons_recent * traj_world_gdp_2030

* World totals of conservation spending in 2015 and 2030

	* 2015: country-based total, stored on every observation
	egen tolconsgdp_2015 = sum(consgdp_2015)
	sort tolconsgdp_2015
	carryforward tolconsgdp_2015, replace
	* The regional allocation is added on top of the country-based total
	replace tolconsgdp_2015 = tolconsgdp_2015 + reg_cons_gdp_2015
	 label variable tolconsgdp_2015 "World total public spending on conservation, 2015"
	* 2030: same construction
	egen tolconsgdp_2030 = sum(consgdp_2030)
	sort tolconsgdp_2030
	carryforward tolconsgdp_2030, replace
	* The regional allocation is added on top of the country-based total
	replace tolconsgdp_2030 = tolconsgdp_2030 + reg_cons_gdp_2030
	 label variable tolconsgdp_2030 "World total public spending on conservation, 2030"

save "output\PNAS_conservation_pctGDP.dta", replace


********************************************************************************
********************************************************************************
**** 8. Justice (Public safety and order)
********************************************************************************
********************************************************************************

********************************************************************************
** A . IMF - Government Finance Statistics
**  	https://data.imf.org/?sk=5804C5E1-0502-4672-BDCD-671BCDC565A9
**		Downloaded 06/13/2019
**		General government expenditure on public order & safety
**		% of GDP
********************************************************************************

* Import data and clean
 * Four government-level columns are imported and kept, but only General
 * Government expenditure (pct_order_gen) is used for the data check below

* Header row is row 3 of the sheet (cellrange starts at A3)
import excel using "input\IMF_GFS_Expenditure_by_Functions_of_Governm.xlsx", firstrow clear cellrange(A3)

* Columns A and B hold the country name and year; the rest are one column per government level
rename (A B Generalgovernment Centralgovernmentexclsocial Budgetarycentralgovernment Extrabudgetarycentralgovernmen) (countryname year pct_order_gen pct_order_cent pct_order_budget pct_order_extrabudget)
destring year, replace
keep countryname year pct_order_gen pct_order_cent pct_order_budget pct_order_extrabudget

* Convert percentages to decimals
foreach v of varlist pct_order_gen pct_order_cent pct_order_budget pct_order_extrabudget{
replace `v' = `v'/100
}

duplicates drop // Congo and Equatorial Guinea have duplicates

* Standardize names for merging
CLEAN_COUNTRY_NAMES countryname

replace countryname = "Serbia" if countryname == "Serbia, Republic of"

* Limit to UN member states
* Member states absent from the IMF file are kept; they enter with a missing year
merge m:1 countryname using "output\UN_memberstates.dta"
drop if _merge==1
drop _merge

rename *, lower

* Change formatting of some indicators
recast str3 countrycode
recast str4 region
recast str30 countryname

* Create observation of country and year for every pair
* fillin adds the missing countrycode-year combinations and flags them in _fillin
fillin countrycode year
* The added rows have no identifiers, so copy them within country in three passes:
* forward in time, backward in time, and finally with original rows sorted ahead of filled rows
sort countrycode year
by countrycode: carryforward countryname region un_mem, replace
gsort countrycode -year
by countrycode: carryforward countryname region un_mem, replace
sort countrycode _fillin
by countrycode: carryforward countryname region un_mem, replace
sort countrycode year
* Remove the rows with a missing year (member states that were not in the IMF file entered with one)
drop if year == .
drop _fillin

* Find countries for which we have absolutely no data
* data flags a non-missing general government value; sum_data counts those per country
gen data = (pct_order_gen != .)
egen sum_data = sum(data), by(countrycode)
codebook countrycode if sum_data == 0 // 131 countries with no datapoints whatsoever. see what can be filled by UN Stats

save "input\imf exp on public order and safety.dta", replace


* Supplemented with UNStats (2018)
 * http://data.un.org/Data.aspx?d=SNA&f=group_code%3A301 - 3.1
 * Downloaded on 6/13/2019
 * Government final consumption expenditure on public order and safety at current prices
 * LCU
********************************************************************************

* Paths below use forward slashes, which Stata accepts on every platform
import delimited using "input/UNStats government final consumption expenditure by function.csv", varnames(1) clear

keep if item == "Public order and safety"

* Generate function in country_rename.do
CLEAN_COUNTRY_NAMES countryorarea

rename countryorarea country

keep country year series currency snasystem value
rename value public_order

* Merge in GDP in LCU information http://data.un.org/Data.aspx?q=Table+1.1+Gross+domestic+product+by+expenditures+at+current+prices&d=SNA&f=group_code%3a101
 * Keep only variable on "Equals: GROSS DOMESTIC PRODUCT"
 * NOTE(review): no item filter is applied to the GDP file in this code; the
 * csv is presumably already restricted to that item - confirm.
	* Note from UNStats below:
	* Different series numbers (column “Series”) are used to store different time-series versions of national accounts statistics. 
	* Series numbers with two digits (10,20) refer to data compiled following the 1968 SNA national accounts methodology, while series numbers with three digits (100, 200, etc) 
	* refer to data compiled using the 1993 SNA national accounts methodology, and series numbers with four digits (1000, 1100, etc) refer to data compiled using the 2008 SNA national accounts methodology. 
	* In addition to different methodologies, different series numbers are used when data are reported in different currencies, fiscal years, or by different sources. 
	* Furthermore, data are stored under a new series number whenever there are significant changes in compilation practices which make the time series no longer comparable.
preserve
import delimited using "input/UNStats gdp.csv", varnames(1) clear
* Clean and rename the country column before keeping it by its new name, in
* the same order as for the expenditure file above. Keeping "country" first
* only worked because Stata expanded it as an abbreviation of countryorarea.
CLEAN_COUNTRY_NAMES countryorarea

rename countryorarea country

keep country year series currency snasystem value

rename value gdp
* Save to the tempfile itself so Stata erases it automatically; adding a
* .dta suffix would write a separate file that is never cleaned up
tempfile gdp
save "`gdp'", replace
restore

* Match expenditure and GDP reported under the same series and currency
merge 1:1 country year series currency using "`gdp'"
keep if _merge==3
drop _merge

gen unstats_puborder_gdp = public_order/gdp

* Keep one row per country-year. The explicit sort on series makes the choice
* deterministic: the row with the highest series value is kept.
* NOTE(review): the original kept whichever row was last in memory; confirm
* that the highest series is the intended one.
bysort country year (series): keep if _n == _N

* Add the missing country-year combinations
fillin country year
drop _fillin
sort country year

rename country countryname
replace countryname = "Swaziland" if countryname == "Eswatini"
replace countryname = "Czech Republic" if countryname == "Czechia"

* Limit to UN member states (member states without UNStats data are kept as rows)
merge m:1 countryname using "output/UN_memberstates.dta"
drop if _merge==1
drop _merge
rename *, lower

recast str3 countrycode
recast str30 countryname

* Add the missing countrycode-year combinations and copy the identifiers
* from the original rows (sorted first) onto the filled rows
fillin countrycode year
sort countrycode _fillin
by countrycode: carryforward countryname region un_mem, replace
sort countrycode year
drop if year == .

save "input/UNStats order and safety.dta", replace

* Combine IMF and UNStats public order and safety data
use "input\imf exp on public order and safety.dta", clear

* Only matched country-years are kept. Original note: master-only rows are 2018
* (no data anyway) and using-only rows are pre-1980 data, which are not needed.
merge 1:1 countrycode year using "input\UNStats order and safety.dta", keep(match) nogenerate

* Country-years where sum_data is zero but UNStats has a value
* (original note: 41 countries where UN Stats can fill in and IMF has no info)
local un_fills_gap "sum_data == 0 & unstats_puborder_gdp!=."
codebook countrycode if `un_fills_gap'

* Fill those gaps with the UNStats share and record where each value came from
replace pct_order_gen = unstats_puborder_gdp if `un_fills_gap'
gen source = "IMF GFS COFOG" if sum_data > 0 & pct_order_gen!=.
replace source = "UNStats Table 3.1 Government final consumption expenditure by function at current prices" if `un_fills_gap'
replace source = "No data" if source == ""

* Want to end on 3 year moving average centered on 2015
keep if year<=2016

* Centered 3-year moving average of the public order share of GDP

encode countrycode, generate(id)
xtset id year
sort id year
* Missing whenever the previous, current or next year is missing
by id: generate order_recent = (L.pct_order_gen + pct_order_gen + F.pct_order_gen)/3

* Latest year for which each country has a moving average (missing if none)
egen maxyear = max(cond(missing(order_recent), ., year)), by(countrycode)

* One row per country: the latest moving average where one exists, otherwise
* the (arbitrary) 2016 row, whose moving average is missing
keep if (year == maxyear) | (missing(maxyear) & year == 2016)
keep countryname countrycode region source order_recent

* 2030 trajectories applying only available year to 2015 and 2030 GDP (constant, 2015 USD)

merge 1:1 countrycode using "output\gdp_cons2030wide.dta"
drop if _merge==2
drop _merge

	* 2015-2030 + average
	* forvalues replaces the out-of-date "for num ...:" construct; the share
	* order_recent is held constant and applied to each year's GDP trajectory.
	forvalues yr = 2015/2030 {
		gen justicegdp_`yr' = order_recent * gdp_traj`yr'
	}
	forvalues yr = 2015/2030 {
		label variable justicegdp_`yr' "Spending on justice (public order and safety), `yr'"
	}
	* Put the yearly variables in sequence so the range below spans exactly 2015-2030
	order justicegdp_20*, sequential
	egen justicegdp_1530 = rowmean(justicegdp_2015-justicegdp_2030)
	 label variable justicegdp_1530 "Spending on justice (public order and safety), avg. 2015-2030"
	

* Calculate total spending on justice in 2015 and 2030

	* egen total() (formerly sum()) already writes the same grand total into
	* every observation, so the sort + carryforward that followed each total was
	* a no-op; the sort on a constant column only scrambled the row order.
	foreach yr in 2015 2030 {
		egen toljusticegdp_`yr' = total(justicegdp_`yr')
		 label variable toljusticegdp_`yr' "World total public spending on justice, `yr'"
	}

save "output\Justice_pctGDP.dta", replace


********************************************************************************
********************************************************************************
**** 9. Merge all datasets together
********************************************************************************
********************************************************************************

* Assemble one row per country with every sector's share of GDP, yearly
* spending, 2015-2030 average and world totals.
* Paths use forward slashes throughout: the original mixed "output\" and
* "output/" within this block; forward slashes work on every platform Stata
* runs on, and avoid the backslash's special meaning before macro references.

** Agriculture
use "output/ag_shGDP_combined.dta", clear
keep countrycode countryname ag3yma_recent aggdp_20* aggdp_1530 tolaggdp_2015 tolaggdp_2030 

** Health
merge 1:1 countrycode using "output/WHO_health_pctgdp.dta", keepusing(countryname health_3yma_recent healthgdp_20* healthgdp_1530 tolhealthgdp_2015 tolhealthgdp_2030)
drop _merge

** Education
merge 1:1 countrycode using "output/education_pctGDP.dta", keepusing(countryname ed_3yma_recent edgdp_20* edgdp_1530 toledgdp_2015 toledgdp_2030)
drop _merge

** Social spending
merge 1:1 countrycode using "output/socialspending_pctGDP.dta", keepusing(countryname SS_recent SSgdp_20* SSgdp_1530 SSgdp_1525 tolSSgdp_2015 tolSSgdp_2030)
drop _merge

** Infrastructure
merge 1:1 countrycode using "output/GCFC_infrastructure_pctGDP.dta", keepusing(countryname infexdef_3yma_recent infgdp_20* infgdp_1530 tolinfgdp_2015 tolinfgdp_2030)
drop _merge

** Conservation
merge 1:1 countrycode using "output/PNAS_conservation_pctGDP.dta", keepusing(countryname cons_recent consgdp_* consgdp_1530 tolconsgdp_2015 tolconsgdp_2030)
drop _merge

** Justice
merge 1:1 countrycode using "output/Justice_pctGDP.dta", keepusing(countryname order_recent justicegdp_* justicegdp_1530 toljusticegdp_2015 toljusticegdp_2030)
drop _merge

* Country's total % of GDP represented in dataset
 * Ag + health + education + SS + infrastructure + conservation + justice
 * (missing as soon as any one sector share is missing)
gen total_shGDP = ag3yma_recent + health_3yma_recent + ed_3yma_recent + SS_recent ///
	+ infexdef_3yma_recent + cons_recent + order_recent
 label variable total_shGDP "Country's total % of GDP spending represented in dataset"

* World Bank regions (rows found only in the region file are not added)
merge 1:1 countrycode using "output\WBregion.dta", keepusing(region) keep(master match) nogenerate

* World Bank income groups, 2015 (rows found only in the income file are not added)
merge 1:1 countrycode using "output\WBincomegroups_all.dta", keepusing(incomegroup2015) keep(master match) nogenerate

* Population out to 2030, plus period averages.
* The ranges rely on pop2015 ... pop2030 being stored consecutively.
merge 1:1 countrycode using "output\pop_all_un62017_2030.dta", keepusing(pop2015-pop2030) nogenerate
egen pop_avg1530 = rowmean(pop2015-pop2030)
 label variable pop_avg1530 "Population, annual avg. 2015-2030"
egen pop_avg1525 = rowmean(pop2015-pop2025)
 label variable pop_avg1525 "Population, annual avg. 2015-2025"
egen pop_avg2630 = rowmean(pop2026-pop2030)
 label variable pop_avg2630 "Population, annual avg. 2026-2030"

* Total GDP (US$ constant); rows found only in the GDP file are not added
merge 1:1 countrycode using "output\gdp_cons2030wide.dta", keepusing(gdp_traj2015 gdp_traj2025 gdp_traj2030 gdp_traj_avg1530) keep(master match) nogenerate

save "output\merged_full_cons.dta", replace

********************************************************************************
** A . Total country spending 
********************************************************************************

* For each country, identify number of non-missing sectors
********************************************************************************
* Spell out the seven sector variables of every period once, so that the counts
* and totals below name their inputs explicitly.
local sdg_sectors aggdp edgdp healthgdp SSgdp infgdp consgdp justicegdp
foreach per in 2015 2025 2030 1530 {
	local vars_`per'
	foreach s of local sdg_sectors {
		local vars_`per' `vars_`per'' `s'_`per'
	}
}

 * 2015
order `vars_2015'
egen count2015 = rownonmiss(`vars_2015')
 label variable count2015 "Number of sectors with data in 2015"
 * 2025
order `vars_2025'
egen count2025 = rownonmiss(`vars_2025')
 label variable count2025 "Number of sectors with data in 2025"
 * 2030
order `vars_2030'
egen count2030 = rownonmiss(`vars_2030')
 label variable count2030 "Number of sectors with data in 2030"
 * 2015-2030
order `vars_1530'
egen count1530 = rownonmiss(`vars_1530')
 label variable count1530 "Number of sectors with data for annual avg. 2015-2030"

* For each country, identify total SDG spending in 2015, 2025, 2030 and annual average
********************************************************************************
* rowtotal() counts a missing sector as zero, so a country with no sector data
* at all would get a total of 0; those totals are reset to missing.

 * 2015
egen cntytolgdp_2015 = rowtotal(`vars_2015')
 label variable cntytolgdp_2015 "Total spending 2015 on the SDGs (Ag + Ed + Health + Social Spending + Infrastructure + Conservation + Justice)"
replace cntytolgdp_2015 = . if count2015 == 0

 * 2025
egen cntytolgdp_2025 = rowtotal(`vars_2025')
 label variable cntytolgdp_2025 "Total spending 2025 on the SDGs (Ag + Ed + Health + Social Spending + Infrastructure +  Conservation + Justice)"
replace cntytolgdp_2025 = . if count2025 == 0

 * 2030
egen cntytolgdp_2030 = rowtotal(`vars_2030')
 label variable cntytolgdp_2030 "Total spending 2030 on the SDGs (Ag + Ed + Health + Social Spending + Infrastructure +  Conservation + Justice)"
replace cntytolgdp_2030 = . if count2030 == 0

 * 2015-2030 annual average
egen cntytolgdp_1530 = rowtotal(`vars_1530')
 label variable cntytolgdp_1530 "Total avg. annual spending 2015-2030 on the SDGs (Ag + Ed + Health + Social Spending + Infrastructure + Conservation + Justice)"
replace cntytolgdp_1530 = . if count1530 == 0

********************************************************************************
** B . Total world spending in 2015 and 2030
********************************************************************************

* Total world GDP and total world SDG spending, 2015 and 2030
********************************************************************************
* World GDP is summed over the countries in memory; world spending adds up the
* per-sector world totals brought in with each sector dataset.
foreach yr in 2015 2030 {
	egen tolgdp_`yr' = total(gdp_traj`yr')
	 label variable tolgdp_`yr' "Total world GDP `yr'"

	gen worldtolSDG_`yr' =  tolaggdp_`yr' + toledgdp_`yr' + tolhealthgdp_`yr' + tolSSgdp_`yr' + tolinfgdp_`yr' + tolconsgdp_`yr' + toljusticegdp_`yr'
	 label variable worldtolSDG_`yr' "Total world spending `yr' on the SDGs (Ag + Ed + Health + Social Spending + Infrastructure + Conservation + Justice)"
}

********************************************************************************
** C . Spending in 2015, 2030, and avg. 2015-2030, by income group
********************************************************************************

* By sector
********************************************************************************
* A by-group egen total() already writes the group sum into every observation
* of the group, so the sort + carryforward that used to follow each total was a
* no-op and has been removed (it also scrambled the row order).
local sdg_sectors aggdp edgdp healthgdp SSgdp infgdp consgdp justicegdp
foreach per in 2015 2030 1530 {
	* Label wording differs for the 2015-2030 annual average
	if "`per'" == "1530" {
		local when "avg. annual 2015-2030"
	}
	else {
		local when "in `per'"
	}
	* Remember the variables created for this period for the totals below
	local incvars_`per'
	foreach s of local sdg_sectors {
		bysort incomegroup2015: egen inc`s'_`per' = total(`s'_`per')
		 label variable inc`s'_`per' "Public spending `when', by incomegroup"
		local incvars_`per' `incvars_`per'' inc`s'_`per'
	}
}

* Total spending
********************************************************************************
* Explicit variable lists replace the "first - last" ranges, which were only
* correct as long as the seven variables happened to be stored consecutively.
egen inctolSDG_2015 = rowtotal(`incvars_2015')
 label variable inctolSDG_2015 "Total spending on SDGs in 2015, by incomegroup"

egen inctolSDG_2030 = rowtotal(`incvars_2030')
 label variable inctolSDG_2030 "Total spending on SDGs in 2030, by incomegroup"

egen inctolSDG_1530 = rowtotal(`incvars_1530')
 label variable inctolSDG_1530 "Avg. annual spending on SDGs in 2015-2030, by incomegroup"

********************************************************************************
** D . Spending in 2015 and 2030, by region
********************************************************************************

* By sector
********************************************************************************
* A by-group egen total() already writes the regional sum into every
* observation of the region, so the sort + carryforward that used to follow
* each total was a no-op and has been removed.
local sdg_sectors aggdp edgdp healthgdp SSgdp infgdp consgdp justicegdp
foreach yr in 2015 2030 {
	* Remember the variables created for this year for the totals below
	local rvars_`yr'
	foreach s of local sdg_sectors {
		bysort region: egen r`s'_`yr' = total(`s'_`yr')
		 label variable r`s'_`yr' "Public spending in `yr', by region"
		local rvars_`yr' `rvars_`yr'' r`s'_`yr'
	}
}

* Total spending
********************************************************************************
* Explicit variable lists replace the order-dependent "first - last" ranges
egen rtolSDG_2015 = rowtotal(`rvars_2015')
 label variable rtolSDG_2015 "Total spending on SDGs in 2015, by region"

egen rtolSDG_2030 = rowtotal(`rvars_2030')
 label variable rtolSDG_2030 "Total spending on SDGs in 2030, by region"
   
********************************************************************************
** E . Per capita spending in 2015, 2025, 2030, and avg. 2015-2030, by sector
********************************************************************************

* Per capita values: each sector, the country total and GDP, divided by the
* population of the same year
foreach yr in 2015 2025 2030 {
	foreach var of varlist aggdp_`yr' healthgdp_`yr' edgdp_`yr' SSgdp_`yr' infgdp_`yr' consgdp_`yr' justicegdp_`yr' cntytolgdp_`yr' {
		gen pc_`var' = `var' / pop`yr'
		 label variable pc_`var' "Per capita spending, `yr'"
	}
	* GDP per capita is created last, as before, but no longer carries the
	* "Per capita spending" label it used to share with the spending variables
	gen pc_gdp_traj`yr' = gdp_traj`yr' / pop`yr'
	 label variable pc_gdp_traj`yr' "GDP per capita, `yr'"
}

* 2015-2030 annual averages use the average population over the same period
foreach var of varlist aggdp_1530 healthgdp_1530 edgdp_1530 SSgdp_1530 infgdp_1530 consgdp_1530 justicegdp_1530 cntytolgdp_1530 {
	gen pc_`var' = `var' / pop_avg1530
	 label variable pc_`var' "Per capita spending, avg. 2015-2030"
	}

* Social spending 2015-2025 average (label previously read "avg. 2015-2015")
gen pc_SSgdp_1525 = SSgdp_1525/pop_avg1525
 label variable pc_SSgdp_1525 "Per capita spending, avg. 2015-2025"

********************************************************************************
********************************************************************************
**** 10. Fill in missing values
********************************************************************************
********************************************************************************

********************************************************************************	
** A . Fill in spending for missing sectors with regional income group averages, weighted by population
**  	Interpolating with % of GDP in 2015, then applying that percentage forward to future years,
**  	i.e., applying the same approach used for countries with data
********************************************************************************	

* Sector shares of GDP: population-weighted average by income group and region,
* plus a copy of each country's own share to be filled in where missing.
* raw_shares and sector_list are parallel lists (same position = same sector).
local raw_shares "ag3yma_recent health_3yma_recent ed_3yma_recent SS_recent infexdef_3yma_recent cons_recent order_recent"
local sector_list "aggdp healthgdp edgdp SSgdp infgdp consgdp justicegdp"

forvalues k = 1/7 {
	local src : word `k' of `raw_shares'
	local sec : word `k' of `sector_list'
	* Average % of GDP for the country's income group x region (2015 population weights)
	egen avg_share_`sec' = wtmean(`src'), weight(pop2015) by(incomegroup2015 region)
	* Country's own share; gaps are filled below
	gen a_`sec'_shGDP = `src'
}

* Haiti is the only low income LCN region country so apply low income & SSF region average to Haiti
* NOTE(review): the value is hard-coded and only overrides the justice share -- confirm it is still current
replace avg_share_justicegdp=0.0074694363454 if countryname=="Haiti"

* Where a country has no share of its own, use the group average
foreach sec of local sector_list {
	replace a_`sec'_shGDP = avg_share_`sec' if a_`sec'_shGDP==.
}

* Country total of % GDP with interpolated values
gen a_total_shGDP=a_aggdp_shGDP+a_healthgdp_shGDP+a_edgdp_shGDP+a_SSgdp_shGDP+a_infgdp_shGDP+a_consgdp_shGDP+a_justicegdp_shGDP
	
* Filled-in spending by sector for 2015, 2025 and 2030.
* The loop macro is named yr: the original used the macro name 1, which is the
* do-file's first positional argument.
foreach yr of numlist 2015 2025 2030 {

	* Copy of each sector's country spending; where it is missing, use the
	* income-group/region average share of GDP times the country's GDP
	foreach s in aggdp healthgdp edgdp SSgdp infgdp consgdp justicegdp {
		gen a_`s'_`yr' = `s'_`yr'
		replace a_`s'_`yr' = avg_share_`s'*gdp_traj`yr' if `s'_`yr' == .
		label var a_`s'_`yr' "Variable `s'_`yr' with missing val filled in w/ income-regional avg"
	}

	foreach sec of varlist aggdp_`yr' healthgdp_`yr' edgdp_`yr' SSgdp_`yr' infgdp_`yr' consgdp_`yr' justicegdp_`yr' {

		* Copy of country per capita spending; where it is missing, use the
		* filled-in total divided by population
		gen a_pc_`sec' = pc_`sec'
		replace a_pc_`sec' = a_`sec'/pop`yr' if pc_`sec' == .
		label var a_pc_`sec' "Variable pc_`sec' with missing val filled in w/ income-regional avg per capita times population"

		* World total
		egen a_tol`sec' = total(a_`sec')
		label var a_tol`sec' "Filled-in world total public spending from `sec'"

		* Income group total. egen total() already fills every observation of
		* the group, so the former sort + carryforward (a no-op) is dropped.
		bysort incomegroup2015: egen a_inc`sec' = total(a_`sec')
		label variable a_inc`sec' "Public spending in `sec', by incomegroup filled in"

		* Regional total (same reasoning as for income groups)
		bysort region: egen a_r`sec' = total(a_`sec')
		label variable a_r`sec' "Public spending in `sec', by region filled in"
	}
}
	
	* Totals built from the filled-in sector values, for 2015, 2025 and 2030
	foreach yr of numlist 2015 2025 2030 {

		* Country per capita total
		gen a_pc_cntytolgdp_`yr' = a_pc_aggdp_`yr'+a_pc_edgdp_`yr'+a_pc_healthgdp_`yr' ///
			+a_pc_infgdp_`yr'+a_pc_SSgdp_`yr'+a_pc_consgdp_`yr'+a_pc_justicegdp_`yr'
		label var a_pc_cntytolgdp_`yr' "Per capita spending total, `yr'"

		* Country total
		gen a_cntytolgdp_`yr' = a_aggdp_`yr'+a_healthgdp_`yr'+a_edgdp_`yr' ///
			+a_SSgdp_`yr'+a_infgdp_`yr'+a_consgdp_`yr'+a_justicegdp_`yr'
		label variable a_cntytolgdp_`yr' "Total spending `yr' on the SDGs (Ag + Ed + Health + Social Spending + Infrastructure + Conservation + Justice)"

		* Income group: total spending, then population-weighted per capita spending
		gen a_inctolSDG_`yr' = a_incaggdp_`yr'+a_incedgdp_`yr'+a_inchealthgdp_`yr' ///
			+a_incinfgdp_`yr'+a_incSSgdp_`yr'+a_incconsgdp_`yr'+a_incjusticegdp_`yr'
		label variable a_inctolSDG_`yr' "Total spending on SDGs in `yr', by incomegroup filled in"
		egen a_incpc_cntytolgdp_`yr' = wtmean(a_pc_cntytolgdp_`yr'), weight(pop`yr') by(incomegroup2015)
		label variable a_incpc_cntytolgdp_`yr' "Per capita spending on SDGs in `yr', by income group filled in"

		* Region: total spending, then population-weighted per capita spending
		gen a_rtolSDG_`yr' = a_raggdp_`yr'+a_redgdp_`yr'+a_rhealthgdp_`yr' ///
			+a_rinfgdp_`yr'+a_rSSgdp_`yr'+a_rconsgdp_`yr'+a_rjusticegdp_`yr'
		label variable a_rtolSDG_`yr' "Total spending on SDGs in `yr', by region filled in"
		egen a_rpc_cntytolgdp_`yr' = wtmean(a_pc_cntytolgdp_`yr'), weight(pop`yr') by(region)
		label variable a_rpc_cntytolgdp_`yr' "Per capita spending on SDGs in `yr', by region filled in"

		* World total across sectors
		gen a_worldtolSDG_`yr' = a_tolaggdp_`yr'+a_toledgdp_`yr'+a_tolhealthgdp_`yr' ///
			+a_tolinfgdp_`yr'+a_tolSSgdp_`yr'+a_tolconsgdp_`yr'+a_toljusticegdp_`yr'
	}
	
********************************************************************************
** B . Per capita total, by income group, and by region
********************************************************************************

* World
********************************************************************************
* Population-weighted mean of per capita spending over all countries, for each
* sector and for the country total, in 2015 and 2030
local pc_items aggdp healthgdp edgdp SSgdp infgdp consgdp justicegdp cntytolgdp
foreach yr in 2015 2030 {
	foreach item of local pc_items {
		egen tolpc_`item'_`yr' = wtmean(pc_`item'_`yr'), weight(pop`yr')
		 label variable tolpc_`item'_`yr' "Per capita spending world total, `yr'"
	}
}

* Income group
********************************************************************************
local pc_sectors aggdp healthgdp edgdp SSgdp infgdp consgdp justicegdp

* 2015 and 2030
foreach yr in 2015 2030 {
	* Population-weighted per capita spending by sector
	foreach s of local pc_sectors {
		bysort incomegroup2015: egen incpc_`s'_`yr' = wtmean(pc_`s'_`yr'), weight(pop`yr')
		 label variable incpc_`s'_`yr' "Per capita spending by WB 2015 income groups, `yr'"
	}
	* Total per capita spending is averaged only over countries with data for
	* all seven sectors; the result is then copied to the other countries of
	* the group (non-missing values are sorted first within each group).
	bysort incomegroup2015: egen incpc_cntytolgdp_`yr' = wtmean(pc_cntytolgdp_`yr') if count`yr' == 7, weight(pop`yr')
	gsort incomegroup2015 -incpc_cntytolgdp_`yr'
	by incomegroup2015: carryforward incpc_cntytolgdp_`yr', replace
	 label variable incpc_cntytolgdp_`yr' "Per capita spending by WB 2015 income groups, `yr'"
}

* 2015-2030 annual average, weighted by average population over the period
foreach s of local pc_sectors {
	bysort incomegroup2015: egen incpc_`s'_1530 = wtmean(pc_`s'_1530), weight(pop_avg1530)
	 label variable incpc_`s'_1530 "Per capita spending by WB 2015 incomegroups, avg. 2015-2030"
}
 
* Region
********************************************************************************
* Population-weighted per capita spending by region, for each sector and for
* the country total, in 2015 and 2030
local pc_items aggdp healthgdp edgdp SSgdp infgdp consgdp justicegdp cntytolgdp
foreach yr in 2015 2030 {
	foreach item of local pc_items {
		bysort region: egen rpc_`item'_`yr' = wtmean(pc_`item'_`yr'), weight(pop`yr')
		 label variable rpc_`item'_`yr' "Per capita spending by region, `yr'"
	}
}

save "output\merged_full_totals_cons.dta", replace

********************************************************************************
********************************************************************************
**** 11. Building related datasets
********************************************************************************
********************************************************************************

use "output\merged_full_totals_cons.dta", clear

* Natural logs of per capita SDG spending (as observed, and with missing
* sectors filled in) for 2015 and 2030
foreach yr in 2015 2030 {
	gen lpc_cntytolgdp_`yr' = ln(pc_cntytolgdp_`yr')
	 label variable lpc_cntytolgdp_`yr' "Logged p.c. SDG spending, `yr' (USD const. 2015)"
	gen la_pc_cntytolgdp_`yr' = ln(a_pc_cntytolgdp_`yr')
	 label variable la_pc_cntytolgdp_`yr' "Logged p.c. SDG spending, `yr' (USD const. 2015) filled in"
}

* Natural logs of GDP per capita for 2015 and 2030
foreach yr in 2015 2030 {
	gen lpc_gdp_traj`yr' = ln(pc_gdp_traj`yr')
	  label variable lpc_gdp_traj`yr' "Logged GDP per capita, `yr' (USD cons. 2015)"
}

********************************************************************************
** A . Apply 1.13x GDP/pc multiplier to later years
********************************************************************************

* Cumulative growth of GDP per capita from 2015 to 2030 and to 2025
foreach hz in 30 25 {
	gen gdp_pc_growth15`hz' = pc_gdp_traj20`hz'/pc_gdp_traj2015 - 1
}

* Inflating by factor of 1.13
foreach hz in 30 25 {
	gen pc_spending_growth15`hz' = gdp_pc_growth15`hz' * 1.13
}

* Sector by sector: grow 2015 per capita spending by the inflated growth rate,
* then multiply by the target year's population to get totals. Done for the
* observed values and for the filled-in (a_) values.
foreach sec in aggdp healthgdp edgdp SSgdp infgdp consgdp justicegdp{
	foreach hz in 30 25 {
		gen pc_`sec'_20`hz'_alt = pc_`sec'_2015 * (1 + pc_spending_growth15`hz')
		gen tol_`sec'_20`hz'_alt = pc_`sec'_20`hz'_alt*pop20`hz'

		gen a_pc_`sec'_20`hz'_alt = a_pc_`sec'_2015 * (1 + pc_spending_growth15`hz')
		gen a_tol_`sec'_20`hz'_alt = a_pc_`sec'_20`hz'_alt*pop20`hz'
	}
}

* Country totals, observed values
foreach hz in 30 25 {
	gen pc_cntytolgdp_20`hz'_alt = pc_cntytolgdp_2015*(1 + pc_spending_growth15`hz')
	gen cntytolgdp_20`hz'_alt = pc_cntytolgdp_20`hz'_alt*pop20`hz'
}

* Country totals, filled-in values
foreach hz in 30 25 {
	gen a_pc_cntytolgdp_20`hz'_alt = a_pc_cntytolgdp_2015*(1 + pc_spending_growth15`hz')
	gen a_cntytolgdp_20`hz'_alt = a_pc_cntytolgdp_20`hz'_alt*pop20`hz'
}

save "output\merged_full_totals_cons.dta", replace

********************************************************************************
** B . Merge with DRM and ODA data
**  	UNU-WIDER Government revenue dataset
**  	https://www.wider.unu.edu/project/government-revenue-dataset
********************************************************************************

* Load the revenue dataset and key it by countrycode
use "input\Merged.dta", clear
rename iso countrycode
drop if countrycode == "" // empty observations at bottom

* Recode three country codes (parallel lists: same position = old -> new)
local old_codes KSV TMP WBG
local new_codes XKX TLS PSE
forvalues k = 1/3 {
	local from : word `k' of `old_codes'
	local to   : word `k' of `new_codes'
	replace countrycode = "`to'" if countrycode == "`from'"
}

* Attach WDI indicators; list the revenue-only rows, then keep matches only
merge 1:1 countrycode year using "input\WDI indicators 0605.dta"
tab countrycode if _merge==1 // Anguilla and Montserrat
drop if _merge != 3
drop _merge

* World Bank income classification, read into a temporary dataset.
* NOTE(review): this downloads the current classification from a live URL, so
* results can change (or the import can fail) when the World Bank updates or
* moves the file -- consider archiving a copy under input.
preserve
import excel using "https://databank.worldbank.org/data/download/site-content/CLASS.xls", firstrow cellrange(A5) clear
rename (Code Incomegroup) (countrycode incgroup)
* Drop rows whose income group is "x" or empty
drop if inlist(incgroup, "x", "")
keep countrycode incgroup
* Save under the tempfile name itself. Appending ".dta" creates a second file
* that Stata does not erase automatically when the do-file ends.
tempfile incgroup
save `incgroup', replace
restore

* Attach the income group to every country-year; classification rows without a
* matching country in memory are discarded
merge m:1 countrycode using `incgroup'
drop if _merge == 2
drop _merge

* Combined income variable, prefer GNI per capita and will fill in with GDP per capita if necessary
gen income = gni_pc
replace income = gdp_pc if income == .
 label var income "Income per capita, using GNI pc first, then filling in with GDP pc"

* Log income variable
gen ln_income = ln(income)
 label var ln_income "Log of Income per capita, using GNI pc first, then filling in with GDP pc"

* Keep latest value
* Rows are ordered by year within country before carrying values forward. The
* original "bys countrycode:" did not fix the within-country order, so a value
* could be carried across years in an arbitrary direction.
bysort countrycode (year): carryforward rev_ex_gr_inc_sc, replace

* Per capita revenue excluding grants and including social contributions
* (the revenue variables are divided by 100 here, i.e. treated as percentages)
gen rev_pc = income * rev_ex_gr_inc_sc/100
 label var rev_pc "Revenue per capita excluding grants and including social contributions"
gen ln_rev_pc = ln(rev_pc)
 label var ln_rev_pc "Log of Revenue per capita excluding grants and including social contributions"

gen rev_pc_grants = income * rev_inc_sc/100
 label var rev_pc_grants "Revenue per capita including grants and including social contributions"
gen ln_rev_pc_grants = ln(rev_pc_grants)
 label var ln_rev_pc_grants "Log of Revenue per capita including grants and including social contributions"

* Grants as percent of revenue
gen gr_rev_pct = grants/rev_ex_gr_inc_sc*100
 label var gr_rev_pct "Grants as a share of Revenue excluding grants including social contribs"

* Log Net ODA per capita (1 is added before taking the log)
gen ln_netoda_pc = ln(netoda_pc + 1)
 label var ln_netoda_pc "Log of Net ODA received per capita (current US$)"

* Reduce to the 2015 cross-section
keep if year == 2015

* Bring in GDP, population and income group from the spending dataset. Rows
* present only in the revenue data are discarded (original note: this restricts
* the sample to UN member states, 193 countries).
merge 1:1 countrycode using "output\merged_full_totals_cons.dta", keepusing(pc_gdp_traj2015 gdp_traj2015 gdp_traj2030 incomegroup2015 pop2015 pop2030 pop_avg1530) keep(match using) nogenerate

* Revenue (excluding grants, including social contributions) applied to 2015
* GDP, in total and per capita
gen rev_gdp_2015 = rev_ex_gr_inc_sc/100*gdp_traj2015
gen pc_rev_gdp_2015 = rev_gdp_2015/pop2015

save "output\spending plus DRM ODA.dta", replace 